"""
Author : Mr.Sun
Datetime : 2024/12/6 17:47 
FileName : summar_match.py
Desc : 
"""

import re
import dataset
import cpca
import logging
import jieba
from typing import List, Dict
from data_operate import base_dir
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder


class KeywordSemanticMatch:
    """Match recruitment-announcement titles by keyword, region and semantics.

    Three strategies are applied in priority order:
      1. regex detection of university staff-recruitment titles,
      2. province/city lookup (via cpca) against keywords stored in SQLite,
      3. TF-IDF cosine-similarity classification as a fallback.
    """

    def __init__(self):
        # SQLite connection; base_dir points at the DB file (data_operate).
        self.db = dataset.connect(
            f'sqlite:///{base_dir}',
            engine_kwargs={'connect_args': {'check_same_thread': True}},
            sqlite_wal_mode=False
        )
        self.query = "select key_word from key_words where status=1"
        self.table = self.db['key_words']

        # Lazily-populated keyword cache; filled on first _get_key_words().
        self.cached_key_words = None

        # Seed keywords per semantic category; used to build the corpus.
        self.categories = {
            "招聘": ["招聘", "岗位", "工作", "机会", "人才", "职位", "编制"],
            "教育": ["教育", "学校", "大学", "学院", "博士", "教师", "高校"],
            "地方": ["省", "市", "区", "县", "乡", "镇"],
            "人才": ["人才", "高端", "精英", "专业", "技术", "引进"],
            "政府": ["政府", "机关", "事业单位", "公务员", "编制"]
        }

        # Regex patterns that identify university/college names in a title.
        self.university_patterns = [
            r'\S+大学',
            r'\S+学院',
            r'\S+高校',
            r'\S+校区'
        ]

        # Titles containing any of these are NOT treated as university
        # recruitment (fresh-graduate / intern / temporary postings).
        self.exclude_keywords = {
            '应届', '选调生', '优秀高校毕业生', '2024届',
            '实习', '兼职', '临时'
        }

        # NOTE(review): label_encoder is fitted but never read by any
        # method; retained only so callers inspecting the attribute keep
        # working. Candidate for removal.
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(list(self.categories.keys()))

        # Minimum cosine similarity for a category to be reported.
        self.confidence_threshold = 0.1

        # Training corpus and its TF-IDF representation, built once.
        self.train_corpus = self._build_train_corpus()
        self.vectorizer = TfidfVectorizer()
        self.corpus_vectors = self.vectorizer.fit_transform(self.train_corpus)

    def _get_key_words(self):
        """Return the set of active keywords, querying the DB at most once.

        Returns:
            set[str]: distinct ``key_word`` values where status=1.
        """
        if self.cached_key_words is None:
            self.cached_key_words = {
                row['key_word'] for row in self.db.query(self.query)
            }
        return self.cached_key_words

    def _build_train_corpus(self) -> List[str]:
        """Build the TF-IDF training corpus from the category seed keywords.

        Each document has the form ``"<category> <kw1> <kw2> ..."``.  Every
        document is replicated 5 times, matching the original corpus so the
        fitted IDF weights (and hence all similarity scores) are unchanged.
        """
        train_data = []
        for category, keywords in self.categories.items():
            doc = f"{category} {' '.join(keywords)}"
            train_data.extend([doc] * 5)
        return train_data

    def semantic_classify(self, text: str, top_k: int = 3) -> List[Dict]:
        """Classify *text* against the seed categories by cosine similarity.

        Args:
            text: title text to classify.
            top_k: maximum number of categories to return.

        Returns:
            Up to *top_k* dicts ``{"category": str, "confidence": float}``
            sorted by confidence descending; categories at or below the
            confidence threshold are dropped.
        """
        # Tokenize with jieba so the space-joined text matches the
        # whitespace-separated corpus vocabulary.
        words = jieba.lcut(text)
        text_vector = self.vectorizer.transform([' '.join(words)])

        similarities = cosine_similarity(text_vector, self.corpus_vectors)[0]

        # Aggregate per category: every corpus document starts with its
        # category name, so split()[0] recovers the label; keep the max
        # score over that category's (replicated) documents.
        category_scores = {}
        for doc, score in zip(self.train_corpus, similarities):
            cat_name = doc.split()[0]
            category_scores[cat_name] = max(category_scores.get(cat_name, 0), score)

        sorted_categories = sorted(
            category_scores.items(),
            key=lambda item: item[1],
            reverse=True
        )

        results = []
        for category, confidence in sorted_categories[:top_k]:
            if confidence > self.confidence_threshold:
                results.append({
                    "category": category,
                    # float() unwraps the numpy scalar so the result is
                    # plain, JSON-serializable Python data.
                    "confidence": float(round(confidence, 2))
                })

        return results

    def is_university_recruitment(self, title):
        """Return True if *title* looks like university staff recruitment.

        A title matches when it contains a university-name pattern and none
        of the exclusion keywords (fresh-graduate / intern / temp postings).
        """
        has_university = any(
            re.search(pattern, title)
            for pattern in self.university_patterns
        )
        if not has_university:
            return False

        # Only scan exclusions once a university name actually matched.
        return not any(keyword in title for keyword in self.exclude_keywords)

    def key_word_match(self, title=None) -> Dict:
        """Match *title* using university, region, then semantic strategies.

        Args:
            title: announcement title; falsy values yield a no-match result.

        Returns:
            ``{"match_type": str, "details": dict | None}``. ``match_type``
            is "大学招聘", a province/city name, a semantic category, or
            "None" when nothing matched (or an error occurred).
        """
        if not title:
            return {"match_type": "None", "details": None}

        # Highest priority: university recruitment detection.
        if self.is_university_recruitment(title):
            return {
                "match_type": "大学招聘",
                "details": {
                    "semantic_categories": self.semantic_classify(title)
                }
            }

        try:
            # Extract province/city with cpca; missing fields come back
            # empty/NaN and simply fail the membership tests below.
            df = cpca.transform([title])
            province = df.at[0, '省']
            city = df.at[0, '市']

            key_words = self._get_key_words()

            # Region match: city is more specific, so it is checked first.
            for region in (city, province):
                if region in key_words:
                    return {
                        "match_type": region,
                        "details": {
                            "semantic_categories": self.semantic_classify(title)
                        }
                    }

            # Fallback: best semantic category, if any cleared the threshold.
            semantic_results = self.semantic_classify(title)
            if semantic_results:
                return {
                    "match_type": semantic_results[0]['category'],
                    "details": {
                        "semantic_categories": semantic_results
                    }
                }

        except Exception as e:
            # logging.exception also records the traceback, unlike error().
            logging.exception("匹配过程出错: %s", e)

        return {"match_type": "None", "details": None}


if __name__ == "__main__":
    matcher = KeywordSemanticMatch()
    result = matcher.key_word_match("2025年安徽蚌埠工商学院教职工招聘公告(第一批)")
    print(f"匹配类型: {result['match_type']}")
    print(f"详细信息: {result['details']}\n")
